# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import squarify
import scipy
from scipy import stats
from plotly.tools import FigureFactory as FF
import chart_studio.plotly as py
purchase=pd.read_csv("C:/Users/ypleow/Desktop/Virtual Intern/Quantium/Task 1/QVI_purchase_behaviour.csv");
purchase.head(2)
purchase.info()
purchase['LYLTY_CARD_NBR'].nunique()
purchase.describe(include='all')
plt.figure(figsize=(6,8))
plt.bar(purchase['LIFESTAGE'].value_counts().index,purchase['LIFESTAGE'].value_counts(),
color = sns.color_palette("mako"))
plt.xticks(rotation = 90)
customer_grouped = purchase.groupby('PREMIUM_CUSTOMER')['LYLTY_CARD_NBR'].count().sort_values(ascending=False)
customer_grouped
plt.figure(figsize=(8,8))
colors = ['royalblue', 'darkslateblue', 'slategrey']
#colors = ['#ff9999','#66b3ff','#99ff99']
customer_grouped.plot.pie(autopct="%.1f%%", colors= colors)
plt.figure(figsize=(6,8))
plt.bar(purchase['PREMIUM_CUSTOMER'].value_counts().index,purchase['PREMIUM_CUSTOMER'].value_counts(),
color = sns.color_palette("mako"))
plt.xticks(rotation = 45)
transaction=pd.read_excel("C:/Users/ypleow/Desktop/Virtual Intern/Quantium/Task 1/QVI_transaction_data.xlsx")
transaction.head(2)
transaction.describe(include='all')
transaction['PROD_NBR'].nunique()
transaction['STORE_NBR'].nunique()
transaction['TXN_ID'].nunique()
transaction.info()
transaction["DATE"]=pd.to_datetime(transaction["DATE"], origin = "1899-12-30",unit="D")
transaction.head(2)
transaction['DATE'].describe()
pd.date_range(start = '2018-07-01', end = '2019-06-30' ).difference(transaction.DATE)
Since the "DATE" column has 364 unique value, it indicates that there is one missing date. 2018-12-25 is identified as the missing dates, this might because the shop is closed during the Chirstmas.
transaction['PROD_QTY'].value_counts()
transaction.drop(labels=transaction[transaction['PROD_QTY']==200].index,inplace=True)
transaction.info()
transaction['PROD_NAME'].value_counts()
transaction['PACKAGE_SIZE']=transaction['PROD_NAME'].str.replace("[^\d]+", '')
transaction.head()
sorted(transaction['PROD_NAME'].unique())
transaction['BRAND']=[a.split()[0] for a in transaction['PROD_NAME']]
sorted(transaction['BRAND'].unique())
transaction['BRAND'].replace('Dorito','Doritos',inplace=True)
transaction['BRAND'].replace('Infzns','Infuzions',inplace=True)
transaction['BRAND'].replace('Natural','NCC',inplace=True)
transaction['BRAND'].replace('Old','Old El Paso',inplace=True)
transaction['BRAND'].replace('Red','RRD',inplace=True)
transaction['BRAND'].replace('Smith','Smiths',inplace=True)
transaction['BRAND'].replace('Snbts','Sunbites',inplace=True)
transaction['BRAND'].replace('WW','Woolworths',inplace=True)
sorted(transaction['BRAND'].unique())
transaction["PRICE"]= transaction["TOT_SALES"]/transaction["PROD_QTY"]
transaction.head()
transaction.describe(include='all')
plt.figure(figsize=(10,6))
#plt.xticks(rotation=90)
#sns.countplot(transaction['PACKAGE_SIZE'])
plt.bar(transaction['PACKAGE_SIZE'].value_counts().index,transaction['PACKAGE_SIZE'].value_counts(),
color = sns.color_palette("mako"))
plt.xticks(rotation = 90)
plt.figure(figsize=(10,6))
sns.barplot(x='PACKAGE_SIZE',y='PROD_QTY',data=transaction,estimator=np.sum)
plt.xticks(rotation = 45)
plt.figure(figsize=(10,6))
plt.bar(transaction['BRAND'].value_counts().index,transaction['BRAND'].value_counts(),
color = sns.color_palette("mako"))
plt.xticks(rotation = 90)
plt.figure(figsize=(10,6))
sns.barplot(x='BRAND',y='PROD_QTY',data=transaction,estimator=np.sum)
plt.xticks(rotation = 45)
grouped = transaction.groupby(['BRAND', 'PACKAGE_SIZE']).size().sort_values(ascending=False).head(10)
grouped
plt.figure(figsize=(8, 8))
sns.heatmap(transaction.corr(),cmap='twilight',annot=True)
plt.xticks(rotation = 45)
plt.yticks(rotation = 45)
df=pd.merge(transaction,purchase,on='LYLTY_CARD_NBR')
df.head(2)
df.info()
#df.isna().sum()
df['PACKAGE_SIZE'] = df.PACKAGE_SIZE.astype(int)
df['TXN_ID'] = df.TXN_ID.astype(str)
date_grouped = df.groupby('DATE').count()[['TXN_ID']]
date_grouped
date1_grouped = df.groupby('DATE').sum()[['PROD_QTY']]
date1_grouped
# Create figure and plot space
fig, ax = plt.subplots(figsize=(16, 8))
# Add x-axis and y-axis
ax.bar(date_grouped.index.values,
date_grouped['TXN_ID'],
color='lightslategrey')
# twin object for two different y-axis on the sample plot
ax2=ax.twinx()
# make a plot with different y-axis using second axis object
ax2.plot(date1_grouped.index.values,
date1_grouped['PROD_QTY'],
color='crimson')
ax2.set_ylabel("Total Quantity",color="crimson")
ax2.set_ylim(ymin=1000)
# Set title and labels for axes
ax.set(xlabel="Date",
title="Daily Transaction")
ax.set_ylabel("Total Transaction",color='lightslategrey')
ax.set_ylim(ymin=500)
plt.show()
Result: Spike of total transaction and total quantity sold in the end of year 2018. Total Quantity sold in mid of August 2018 and May 2019 drop significantly but the transaction remain around the range.
df.loc[df['PACKAGE_SIZE'] <= 300, 'PACKAGE_SIZECATEGORY'] = 'Between 150 and 300'
df.loc[df['PACKAGE_SIZE'] <= 150, 'PACKAGE_SIZECATEGORY'] = 'Below 150'
df.loc[df['PACKAGE_SIZE'] > 300, 'PACKAGE_SIZECATEGORY'] = 'Above 300'
df['PACKAGE_SIZECATEGORY'].value_counts()
premium_grouped = df.groupby('PREMIUM_CUSTOMER')['TOT_SALES'].sum().sort_values(ascending=False)
premium_grouped
plt.figure(figsize=(8,8))
colors = ['steelblue','lightseagreen','mediumaquamarine']
premium_grouped.plot.pie(autopct="%.1f%%", colors=colors)
# Results: Mainstream has higher total sales followed by budget and premium.
plt.figure(figsize=(10,6))
sns.barplot(x='PREMIUM_CUSTOMER',y='TOT_SALES',data=df,estimator=np.sum, palette='viridis')
plt.figure(figsize=(10,6))
sns.barplot(x='LIFESTAGE',y='TOT_SALES',data=df,estimator=np.sum, palette='mako')
plt.xticks(rotation = 45)
# Results: Older's lifestage (Older Families, Older Singles/ Couples and Retirees) contribute the most total sales
# while New Families has lowest total sales.
df_grouped = df.groupby(['LIFESTAGE','PREMIUM_CUSTOMER']).sum()[['TOT_SALES']]
df_grouped
plt.figure(figsize=(16,8))
sns.barplot(y=df_grouped.reset_index()['TOT_SALES'],x=df_grouped.reset_index()['LIFESTAGE'],
hue=df_grouped.reset_index()['PREMIUM_CUSTOMER'], palette='rocket')
Mainstream segment and Older Singles/ Couples segments have the highest total sales when looking the premium_customer and lifestage separately. When deep dive into smaller segmentation, Budget older families contribute the highest total sales followed by Mainstream young Singles/ Couples and Mainstram Retirees.
cust_grouped =df.groupby(['TXN_ID','LIFESTAGE','PREMIUM_CUSTOMER']).mean()[['PROD_QTY']]
cust_grouped
# Compare lifestage and premium customer by the quantity purchased
# Results: Older families, Young Families and Older Singles/Couples have overall higher average quantity purchased.
# Every categories has more than 1 average quantity purchased.
customer_grouped = cust_grouped.groupby(['LIFESTAGE','PREMIUM_CUSTOMER']).mean()[['PROD_QTY']]
customer_grouped
plt.figure(figsize=(16,8))
sns.barplot(y=customer_grouped.reset_index()['PROD_QTY'],x=customer_grouped.reset_index()['LIFESTAGE'],
hue=df_grouped.reset_index()['PREMIUM_CUSTOMER'], palette='mako')
plt.ylim(1.6, 2)
premium_grouped = cust_grouped.groupby(['PREMIUM_CUSTOMER']).mean()[['PROD_QTY']]
#premium_grouped
premium_grouped.sort_values("PROD_QTY",ascending=False)
lifestage_grouped = df.groupby(['LIFESTAGE']).mean()[['PROD_QTY']]
#lifestage_grouped
lifestage_grouped.sort_values("PROD_QTY",ascending=False)
Older Families and Young Families has higher quantity purchased per transaction.
size_grouped = df.groupby(['LIFESTAGE','PACKAGE_SIZECATEGORY'])['LIFESTAGE'].count()
size_grouped
size1_grouped = df.groupby(['PREMIUM_CUSTOMER','PACKAGE_SIZECATEGORY'])['PREMIUM_CUSTOMER'].count()
size1_grouped
size2_grouped = df.groupby('PACKAGE_SIZECATEGORY')['PACKAGE_SIZECATEGORY'].count().sort_values(ascending=False)
size2_grouped
a = df.groupby('PACKAGE_SIZECATEGORY')['PACKAGE_SIZECATEGORY'].count().sort_values(ascending=False).index.get_level_values(0).tolist()
color=['lightpink','aliceblue','thistle']
squarify.plot(size2_grouped, color = color, label = a)
plt.show()
Package size between 150 and 300 has the most quantity of purchased.
price_grouped = df.groupby(['LIFESTAGE','PREMIUM_CUSTOMER']).mean()[['PRICE']]
price_grouped.sort_values("PRICE",ascending=False)
plt.figure(figsize=(16,8))
sns.barplot(y=price_grouped.reset_index()['PRICE'],x=price_grouped.reset_index()['LIFESTAGE'],
hue=price_grouped.reset_index()['PREMIUM_CUSTOMER'], palette='mako')
plt.ylim(3,4.2)
Mainstream Young Singles/ Couples and Midage Singles/ Couples are willing to spend more per packet of chips.
# Mainstream vs Premium
twosample_results = scipy.stats.ttest_ind([4.071485,4.000101],[3.645518,3.752915])
matrix_twosample = [
['Young & Midage Singles/ Couples', 'Test Statistic', 'p-value'],
['Mainstream vs Premium', twosample_results[0], twosample_results[1]]]
twosample_table = FF.create_table(matrix_twosample, index=True)
twosample_table.show()
# Mainstream vs Budget
twosample_results = scipy.stats.ttest_ind([4.071485,4.000101],[3.637681,3.728496])
matrix_twosample = [
['Young & Midage Singles/ Couples', 'Test Statistic', 'p-value'],
['Mainstream vs Budget', twosample_results[0], twosample_results[1]]]
twosample_table = FF.create_table(matrix_twosample, index=True)
twosample_table.show()
The p-values are below 0.05, hence the average per packet price for mainstream is higher than premium and budget.
young=df[(df['PREMIUM_CUSTOMER']=='Mainstream') & (df['LIFESTAGE']=='YOUNG SINGLES/COUPLES')]
columns = ['BRAND', 'PACKAGE_SIZE']
young = pd.DataFrame(young, columns=columns)
young.groupby(['BRAND','PACKAGE_SIZE'])['BRAND'].count().sort_values(ascending=False).head(5)
#print(f"YOUNG SINGLES/COUPLES\n BRAND\n{young['BRAND'].value_counts().head(5)}")
midage=df[(df['PREMIUM_CUSTOMER']=='Mainstream') & (df['LIFESTAGE']=='MIDAGE SINGLES/COUPLES')]
columns = ['BRAND', 'PACKAGE_SIZE']
midage = pd.DataFrame(midage, columns=columns)
midage.groupby(['BRAND','PACKAGE_SIZE'])['BRAND'].count().sort_values(ascending=False).head(5)
#print(f"MIDAGE SINGLES/COUPLES\n BRAND\n{midage['BRAND'].value_counts().head(5)}")
Both young and midage singles/ couples in mainstream prefered Pringles with 134g package size followes by Kettle with 150 and 175g package size.